@@ -19,7 +19,7 @@ module Agents
 
       `url` can be a single url, or an array of urls (for example, for multiple pages with the exact same structure but different content to scrape)
 
-      The `type` value can be `xml`, `html`, or `json`.
+      The `type` value can be `xml`, `html`, `json`, or `text`.
 
       To tell the Agent how to parse the content, specify `extract` as a hash with keys naming the extractions and values of hashes.
 
@@ -40,6 +40,28 @@ module Agents
           "description": { "path": "results.data[*].description" }
         }
 
+      When parsing text, each sub-hash should contain a `regexp` and an `index`. The text is matched against the regular expression repeatedly from beginning to end, collecting the captured group specified by `index` from each match. Each index should be either an integer or a string name which corresponds to a named group `(?<_name_>...)`. For example, to parse lines of `_word_: _definition_`, the following should work:
+
+          "extract": {
+            "word": { "regexp": "^(.+?): (.+)$", "index": 1 },
+            "definition": { "regexp": "^(.+?): (.+)$", "index": 2 }
+          }
+
+      Or if you prefer names to numbers for index:
+
+          "extract": {
+            "word": { "regexp": "^(?<word>.+?): (?<definition>.+)$", "index": "word" },
+            "definition": { "regexp": "^(?<word>.+?): (?<definition>.+)$", "index": "definition" }
+          }
+
+      To extract the whole content as one event:
+
+          "extract": {
+            "content": { "regexp": "\\A(?m:.)*\\z", "index": 0 }
+          }
+
+      Beware that `.` does not match the newline character (LF) unless the `m` flag is in effect, and that `^`/`$` match at the beginning/end of every line rather than of the whole text. See [this document](http://ruby-doc.org/core-#{RUBY_VERSION}/doc/regexp_rdoc.html) to learn about the regular expression variant used in this service.
+
       Note that for all of the formats, whatever you extract MUST have the same number of matches for each extractor. E.g., if you're extracting rows, all extractors must match all rows. For generating CSS selectors, something like [SelectorGadget](http://selectorgadget.com) may be helpful.
 
       Can be configured to use HTTP basic auth by including the `basic_auth` parameter with `"username:password"`, or `["username", "password"]`.
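
For reviewers, here is a standalone sketch of the text-extraction semantics the documentation above describes (not part of the patch; the input string and options hash are invented for illustration):

```ruby
require 'json'

# Hypothetical input and extract options, mirroring the documentation above.
text = "water: wet\nfire: hot\n"
extract = JSON.parse('{
  "word":       { "regexp": "^(.+?): (.+)$", "index": 1 },
  "definition": { "regexp": "^(.+?): (.+)$", "index": 2 }
}')

results = extract.map { |name, details|
  regexp = Regexp.new(details['regexp'])
  values = []
  # String#scan with a block sets Regexp.last_match for every match,
  # so both integer and named-group indices work with MatchData#[].
  text.scan(regexp) { values << Regexp.last_match[details['index']] }
  [name, values]
}.to_h

p results  # => {"word"=>["water", "fire"], "definition"=>["wet", "hot"]}
```

Both extractors produce two values here, which satisfies the rule below that every extractor must yield the same number of matches.
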
@@ -140,7 +162,15 @@ module Agents
         else
           output = {}
           interpolated['extract'].each do |name, extraction_details|
-            if extraction_type == "json"
+            case extraction_type
+            when "text"
+              regexp = Regexp.new(extraction_details['regexp'])
+              result = []
+              doc.scan(regexp) {
+                result << Regexp.last_match[extraction_details['index']]
+              }
+              log "Extracting #{extraction_type} at #{regexp}: #{result}"
+            when "json"
               result = Utils.values_at(doc, extraction_details['path'])
               log "Extracting #{extraction_type} at #{extraction_details['path']}: #{result}"
             else
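
The new `when "text"` branch leans on a Ruby detail worth calling out: when `String#scan` is given a block, `Regexp.last_match` is updated for each match, and `MatchData#[]` accepts either a group number or a named-capture string. A minimal sketch (sample data invented):

```ruby
# Regexp.last_match is refreshed on each iteration of String#scan's block.
"a=1 b=2".scan(/(?<key>\w+)=(?<value>\d+)/) {
  m = Regexp.last_match
  p [m[1], m['key'], m['value']]  # numeric and named access to the same MatchData
}
# => ["a", "a", "1"]
# => ["b", "b", "2"]
```
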
@@ -253,10 +283,13 @@ module Agents
 
       def extraction_type
         (interpolated['type'] || begin
-          if interpolated['url'] =~ /\.(rss|xml)$/i
+          case interpolated['url']
+          when /\.(rss|xml)$/i
             "xml"
-          elsif interpolated['url'] =~ /\.json$/i
+          when /\.json$/i
             "json"
+          when /\.(txt|text)$/i
+            "text"
           else
             "html"
           end
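
For reference, a small sketch of how the suffix-based fallback above classifies URLs when no explicit `type` is given (sample URLs invented; the `/i` flag makes the suffix match case-insensitive):

```ruby
# Hypothetical URLs run through the same suffix rules as extraction_type.
{
  'http://example.com/feed.RSS'  => 'xml',
  'http://example.com/data.json' => 'json',
  'http://example.com/notes.txt' => 'text',
  'http://example.com/index.php' => 'html',  # no known suffix, so html
}.each { |url, expected|
  type =
    case url
    when /\.(rss|xml)$/i  then 'xml'
    when /\.json$/i       then 'json'
    when /\.(txt|text)$/i then 'text'
    else 'html'
    end
  raise "#{url}: got #{type}" unless type == expected
}
```
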
@@ -271,6 +304,8 @@ module Agents
           JSON.parse(data)
         when "html"
           Nokogiri::HTML(data)
+        when "text"
+          data
         else
           raise "Unknown extraction type #{extraction_type}"
         end
@@ -398,6 +398,58 @@ describe Agents::WebsiteAgent do
         event.payload['response']['title'].should == "hello!"
       end
     end
+
+    describe "text parsing" do
+      before do
+        stub_request(:any, /text-site/).to_return(body: <<-EOF, status: 200)
+water: wet
+fire: hot
+        EOF
+        site = {
+          'name' => 'Some Text Response',
+          'expected_update_period_in_days' => '2',
+          'type' => 'text',
+          'url' => 'http://text-site.com',
+          'mode' => 'on_change',
+          'extract' => {
+            'word' => { 'regexp' => '^(.+?): (.+)$', 'index' => 1 },
+            'property' => { 'regexp' => '^(.+?): (.+)$', 'index' => 2 },
+          }
+        }
+        @checker = Agents::WebsiteAgent.new(name: 'Text Site', options: site)
+        @checker.user = users(:bob)
+        @checker.save!
+      end
+
+      it "works with regexp" do
+        lambda {
+          @checker.check
+        }.should change { Event.count }.by(2)
+
+        event1, event2 = Event.last(2)
+        event1.payload['word'].should == 'water'
+        event1.payload['property'].should == 'wet'
+        event2.payload['word'].should == 'fire'
+        event2.payload['property'].should == 'hot'
+      end
+
+      it "works with regexp with named capture" do
+        @checker.options = @checker.options.merge('extract' => {
+          'word' => { 'regexp' => '^(?<word>.+?): (?<property>.+)$', 'index' => 'word' },
+          'property' => { 'regexp' => '^(?<word>.+?): (?<property>.+)$', 'index' => 'property' },
+        })
+
+        lambda {
+          @checker.check
+        }.should change { Event.count }.by(2)
+
+        event1, event2 = Event.last(2)
+        event1.payload['word'].should == 'water'
+        event1.payload['property'].should == 'wet'
+        event2.payload['word'].should == 'fire'
+        event2.payload['property'].should == 'hot'
+      end
+    end
   end
 
   describe "#receive" do